import pandas as pd
import numpy as nm
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
# Load the weekly COVID-19 dataset from a CSV file in the working directory.
df = pd.read_csv("corona.csv")
# Preview the first rows (rendered as cell output when run in a notebook).
df.head()
| Id | Location | Weekly Cases | Year | Weekly Cases per Million | Weekly Deaths | Weekly Deaths per Million | Total Vaccinations | People Vaccinated | People Fully Vaccinated | Total Boosters | Daily Vaccinations | Total Vaccinations per Hundred | People Vaccinated per Hundred | People Fully Vaccinated per Hundred | Total Boosters per Hundred | Daily Vaccinations per Hundred | Daily People Vaccinated | Daily People Vaccinated per Hundred | Next Week's Deaths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 911530868 | World | 2372.0 | 2020 | 0.300 | 65.0 | 0.008 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 344 |
| 1 | 807936902 | World | 5023.0 | 2020 | 0.635 | 114.0 | 0.014 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 361 |
| 2 | 773590408 | World | 5612.0 | 2020 | 0.710 | 116.0 | 0.015 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 431 |
| 3 | 130466459 | World | 7580.0 | 2020 | 0.958 | 153.0 | 0.019 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 463 |
| 4 | 544040446 | World | 8983.0 | 2020 | 1.136 | 187.0 | 0.024 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 506 |
# Report the dataset dimensions as (rows, columns).
print(df.shape)
# Count the missing values in every column.
df.isna().sum()
(129156, 20)
Id 0 Location 0 Weekly Cases 213 Year 0 Weekly Cases per Million 868 Weekly Deaths 1258 Weekly Deaths per Million 1909 Total Vaccinations 88886 People Vaccinated 90727 People Fully Vaccinated 92582 Total Boosters 109309 Daily Vaccinations 51316 Total Vaccinations per Hundred 88886 People Vaccinated per Hundred 90727 People Fully Vaccinated per Hundred 92582 Total Boosters per Hundred 109309 Daily Vaccinations per Hundred 51316 Daily People Vaccinated 51765 Daily People Vaccinated per Hundred 51765 Next Week's Deaths 0 dtype: int64
# Overwrite "Total Vaccinations" with the sum of the two people-level counts.
# A NaN in either operand propagates, so the sum is NaN wherever an input is
# missing.
# NOTE(review): this definition ignores "Total Boosters" -- confirm it is the
# intended meaning of the column.
df["Total Vaccinations"] = df["People Vaccinated"] + df["People Fully Vaccinated"]
df["Total Vaccinations"].isnull().sum()
# A former `df["Total Vaccinations"].fillna("0")` call was removed here: its
# result was never assigned (so it changed nothing), and it used the string
# "0" as a fill value for a numeric column. Filling would not have altered the
# outcome anyway, because every NaN in the sum comes from a NaN operand and
# those rows are discarded by the dropna() below.
# Keep only rows that have a value in every column.
df.dropna(inplace=True)
print(df.shape)
df.isnull().sum()
(18728, 20)
Id 0 Location 0 Weekly Cases 0 Year 0 Weekly Cases per Million 0 Weekly Deaths 0 Weekly Deaths per Million 0 Total Vaccinations 0 People Vaccinated 0 People Fully Vaccinated 0 Total Boosters 0 Daily Vaccinations 0 Total Vaccinations per Hundred 0 People Vaccinated per Hundred 0 People Fully Vaccinated per Hundred 0 Total Boosters per Hundred 0 Daily Vaccinations per Hundred 0 Daily People Vaccinated 0 Daily People Vaccinated per Hundred 0 Next Week's Deaths 0 dtype: int64
# Recompute the derived columns from their raw counterparts: each target
# column is replaced by its source column divided by a fixed constant.
# NOTE(review): these are plain rescalings (count / 1_000_000, count / 100),
# not population-normalised rates, and they overwrite the values loaded from
# the CSV -- confirm that this is intended.
_derived_columns = {
    "Weekly Cases per Million": ("Weekly Cases", 1000000),
    "Weekly Deaths per Million": ("Weekly Deaths", 1000000),
    "Total Boosters per Hundred": ("Total Boosters", 100),
    "Total Vaccinations per Hundred": ("Total Vaccinations", 100),
    "People Vaccinated per Hundred": ("People Vaccinated", 100),
    "People Fully Vaccinated per Hundred": ("People Fully Vaccinated", 100),
    "Daily Vaccinations per Hundred": ("Daily Vaccinations", 100),
    "Daily People Vaccinated per Hundred": ("Daily People Vaccinated", 100),
}
for _target, (_source, _divisor) in _derived_columns.items():
    df[_target] = df[_source] / _divisor
df.head()
| Id | Location | Weekly Cases | Year | Weekly Cases per Million | Weekly Deaths | Weekly Deaths per Million | Total Vaccinations | People Vaccinated | People Fully Vaccinated | Total Boosters | Daily Vaccinations | Total Vaccinations per Hundred | People Vaccinated per Hundred | People Fully Vaccinated per Hundred | Total Boosters per Hundred | Daily Vaccinations per Hundred | Daily People Vaccinated | Daily People Vaccinated per Hundred | Next Week's Deaths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 241 | 275164452 | World | 4174523.0 | 2020 | 4.174523 | 77527.0 | 0.077527 | 7276178.0 | 7231498.0 | 44680.0 | 1.0 | 897447.0 | 72761.78 | 72314.98 | 446.80 | 0.01 | 8974.47 | 690726.0 | 6907.26 | 81042 |
| 242 | 857254713 | World | 4424216.0 | 2021 | 4.424216 | 79456.0 | 0.079456 | 9109346.0 | 9050886.0 | 58460.0 | 9.0 | 1079269.0 | 91093.46 | 90508.86 | 584.60 | 0.09 | 10792.69 | 735617.0 | 7356.17 | 92754 |
| 243 | 515683834 | World | 4553174.0 | 2021 | 4.553174 | 80332.0 | 0.080332 | 11535235.0 | 11343354.0 | 191881.0 | 15.0 | 1303377.0 | 115352.35 | 113433.54 | 1918.81 | 0.15 | 13033.77 | 851085.0 | 8510.85 | 94477 |
| 244 | 725478352 | World | 4619286.0 | 2021 | 4.619286 | 79640.0 | 0.079640 | 12944964.0 | 12578084.0 | 366880.0 | 23.0 | 1397939.0 | 129449.64 | 125780.84 | 3668.80 | 0.23 | 13979.39 | 845521.0 | 8455.21 | 96212 |
| 245 | 844503137 | World | 4649535.0 | 2021 | 4.649535 | 81042.0 | 0.081042 | 14652786.0 | 14002427.0 | 650359.0 | 27.0 | 1581369.0 | 146527.86 | 140024.27 | 6503.59 | 0.27 | 15813.69 | 928498.0 | 9284.98 | 96742 |
# Discard the row identifier and the location label in a single call.
df.drop(columns=["Id", "Location"], inplace=True)
# List the columns that remain.
df.columns
Index(['Weekly Cases', 'Year', 'Weekly Cases per Million', 'Weekly Deaths',
'Weekly Deaths per Million', 'Total Vaccinations', 'People Vaccinated',
'People Fully Vaccinated', 'Total Boosters', 'Daily Vaccinations',
'Total Vaccinations per Hundred', 'People Vaccinated per Hundred',
'People Fully Vaccinated per Hundred', 'Total Boosters per Hundred',
'Daily Vaccinations per Hundred', 'Daily People Vaccinated',
'Daily People Vaccinated per Hundred', 'Next Week's Deaths'],
dtype='object')
# Remove the calendar year so it does not appear in the plots below.
df.drop(columns=["Year"], inplace=True)
import seaborn as sns
# Scatter-plot every pair of remaining columns, with a kernel density
# estimate on the diagonal.
sns.pairplot(df, diag_kind="kde")
C:\Users\hp\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
<seaborn.axisgrid.PairGrid at 0x2260abef7d0>
# Columns whose pairwise correlations are shown: the raw counts plus the
# prediction target.
_heatmap_columns = [
    'Weekly Cases',
    'Weekly Deaths',
    'Total Vaccinations',
    'People Vaccinated',
    'People Fully Vaccinated',
    'Total Boosters',
    'Daily Vaccinations',
    'Daily People Vaccinated',
    "Next Week's Deaths",
]
_correlations = df[_heatmap_columns].corr()
# Draw the correlation matrix with the coefficient printed in each cell.
sns.heatmap(_correlations, cmap='Blues', annot=True)
plt.show()
# Reduce the regressors: drop four vaccination-count columns that the original
# analysis judged to be weakly correlated with the response variable.
df.drop(
    columns=[
        "Total Boosters",
        "People Fully Vaccinated",
        "People Vaccinated",
        "Total Vaccinations",
    ],
    inplace=True,
)
# Feature matrix. Any gap is filled with the number 0, not the string "0":
# a string fill value would turn a numeric column into mixed-type objects and
# break the numeric scaling applied later.
x = df[['Weekly Cases', 'Weekly Deaths', 'Daily Vaccinations', 'Daily People Vaccinated']].fillna(0)
# Response variable.
y = df["Next Week's Deaths"]
x, y
( Weekly Cases Weekly Deaths Daily Vaccinations \
241 4174523.0 77527.0 897447.0
242 4424216.0 79456.0 1079269.0
243 4553174.0 80332.0 1303377.0
244 4619286.0 79640.0 1397939.0
245 4649535.0 81042.0 1581369.0
... ... ... ...
129149 554.0 15.0 7129.0
129151 464.0 13.0 5665.0
129152 471.0 12.0 5295.0
129154 277.0 6.0 5358.0
129155 277.0 6.0 6190.0
Daily People Vaccinated
241 690726.0
242 735617.0
243 851085.0
244 845521.0
245 928498.0
... ...
129149 2177.0
129151 1427.0
129152 1362.0
129154 1633.0
129155 2102.0
[18728 rows x 4 columns],
241 81042
242 92754
243 94477
244 96212
245 96742
...
129149 6
129151 9
129152 7
129154 7
129155 8
Name: Next Week's Deaths, Length: 18728, dtype: int64)
# Show the names of the selected feature columns (rendered as cell output
# when run in a notebook).
x.columns
Index(['Weekly Cases', 'Weekly Deaths', 'Daily Vaccinations',
'Daily People Vaccinated'],
dtype='object')
from sklearn.preprocessing import StandardScaler

# Standardise the features; `x` becomes a NumPy array after this step.
# The target `y` is deliberately left on its original scale.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later -- confirm this is acceptable for the evaluation.
sc = StandardScaler()
sc.fit(x)
x = sc.transform(x)
x, y
(array([[ 2.36390907, 6.5447199 , -0.13662119, 0.05656583],
[ 2.5245435 , 6.71682839, -0.099713 , 0.07874118],
[ 2.60750575, 6.79498652, -0.05422114, 0.13578029],
...,
[-0.32137423, -0.37129315, -0.31771983, -0.28396757],
[-0.32149904, -0.37182847, -0.31770705, -0.2838337 ],
[-0.32149904, -0.37182847, -0.31753816, -0.28360203]]),
241 81042
242 92754
243 94477
244 96212
245 96742
...
129149 6
129151 9
129152 7
129154 7
129155 8
Name: Next Week's Deaths, Length: 18728, dtype: int64)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit ordinary least squares on the training part. fit() returns the
# estimator itself, so `model` and `lr` refer to the same object.
lr = LinearRegression()
lr.fit(x_train, y_train)
model = lr

# Predict the response for the held-out rows.
y_pred = model.predict(x_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the predictions on the held-out rows.
_r2_percent = r2_score(y_test, y_pred) * 100  # R^2 expressed as a percentage
_mae = mean_absolute_error(y_test, y_pred)  # mean absolute error
_rmse = nm.sqrt(mean_squared_error(y_test, y_pred))  # root mean squared error
_r2_percent, _mae, _rmse
(98.91008558095014, 411.1428851158433, 1164.553738645361)
import pickle

# Serialise the fitted regression model to model.pkl. The context manager
# closes the file even if pickle.dump raises, which the former manual
# open()/close() pair did not guarantee.
# NOTE(review): the model was fitted on features standardised by `sc`, and
# that scaler is not saved here -- whatever loads model.pkl presumably has to
# apply the same scaling to its inputs; verify against the consumer.
with open("model.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)